This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(Metrics)
library(readr)
library(ggplot2)#for visualisation
library(corrplot)#for visualisation of correlation
## corrplot 0.92 loaded
library(mlbench)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(plotly)#converting ggplot to plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
library(lattice)
library(caret)
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(caTools)#for splittind data into testing and training data
library(dplyr) #manipulating dataframe
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlbench)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
data <- read_csv("C:/Users/FD_gi/Documents/Regression lineal/data/kc-house-data.csv")
## Rows: 21613 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id
## dbl (19): price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterf...
## dttm (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(data)
## spc_tbl_ [21,613 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
## $ date : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ zipcode : num [1:21613] 98178 98125 98028 98136 98074 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
## $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num [1:21613] 5650 7639 8062 5000 7503 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_character(),
## .. date = col_datetime(format = ""),
## .. price = col_double(),
## .. bedrooms = col_double(),
## .. bathrooms = col_double(),
## .. sqft_living = col_double(),
## .. sqft_lot = col_double(),
## .. floors = col_double(),
## .. waterfront = col_double(),
## .. view = col_double(),
## .. condition = col_double(),
## .. grade = col_double(),
## .. sqft_above = col_double(),
## .. sqft_basement = col_double(),
## .. yr_built = col_double(),
## .. yr_renovated = col_double(),
## .. zipcode = col_double(),
## .. lat = col_double(),
## .. long = col_double(),
## .. sqft_living15 = col_double(),
## .. sqft_lot15 = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
dim(data)
## [1] 21613 21
missmap(data,col=c('yellow','black'),y.at=1,y.labels='',legend=TRUE)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
#Checking for NA and missing values and removing them.
numberOfNA <- length(which(is.na(data)==T))
numberOfNA
## [1] 0
# Remove NA values
data <- data %>%
drop_na()
str(data)
## tibble [21,613 × 21] (S3: tbl_df/tbl/data.frame)
## $ id : chr [1:21613] "7129300520" "6414100192" "5631500400" "2487200875" ...
## $ date : POSIXct[1:21613], format: "2014-10-13" "2014-12-09" ...
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ zipcode : num [1:21613] 98178 98125 98028 98136 98074 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
## $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num [1:21613] 5650 7639 8062 5000 7503 ...
dim(data)
## [1] 21613 21
data <- data %>% dplyr::select(-c(date))
#Drop id
data$id <- NULL
library(corrplot)
str(data)
## tibble [21,613 × 19] (S3: tbl_df/tbl/data.frame)
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ zipcode : num [1:21613] 98178 98125 98028 98136 98074 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
## $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num [1:21613] 5650 7639 8062 5000 7503 ...
corrplot(cor(data))
corrplot(cor(data),method='number')
# Highly correlated variables
correlated <- cor(data)
##The caret findCorrelation evaluates pair-wise correlations across all variables, flagging variables that are highly correlated. Of the identified pairs, the function recommends the removal of the variable with the highest average absolute correlation across the dataset.
highCorr <- findCorrelation(correlated, cutoff=0.80)
highCorr
## [1] 4
names(data[highCorr])
## [1] "sqft_living"
summary(data)
## price bedrooms bathrooms sqft_living
## Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290
## 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427
## Median : 450000 Median : 3.000 Median :2.250 Median : 1910
## Mean : 540088 Mean : 3.371 Mean :2.115 Mean : 2080
## 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550
## Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 520 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7618 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 15107 Mean :1.494 Mean :0.007542 Mean :0.2343
## 3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1651359 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition grade sqft_above sqft_basement
## Min. :1.000 Min. : 1.000 Min. : 290 Min. : 0.0
## 1st Qu.:3.000 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0
## Median :3.000 Median : 7.000 Median :1560 Median : 0.0
## Mean :3.409 Mean : 7.657 Mean :1788 Mean : 291.5
## 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0
## Max. :5.000 Max. :13.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated zipcode lat
## Min. :1900 Min. : 0.0 Min. :98001 Min. :47.16
## 1st Qu.:1951 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47
## Median :1975 Median : 0.0 Median :98065 Median :47.57
## Mean :1971 Mean : 84.4 Mean :98078 Mean :47.56
## 3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68
## Max. :2015 Max. :2015.0 Max. :98199 Max. :47.78
## long sqft_living15 sqft_lot15
## Min. :-122.5 Min. : 399 Min. : 651
## 1st Qu.:-122.3 1st Qu.:1490 1st Qu.: 5100
## Median :-122.2 Median :1840 Median : 7620
## Mean :-122.2 Mean :1987 Mean : 12768
## 3rd Qu.:-122.1 3rd Qu.:2360 3rd Qu.: 10083
## Max. :-121.3 Max. :6210 Max. :871200
# Remove all auxiliary information and data transformation (num -> factor)
factorfun <- function(x){
x <- as.factor(x)
}
newdata <- data %>% mutate_at(c("waterfront", "view", "condition"), factorfun)
# Replace variables of yr_built and yr_renovated with age
for (i in 1:nrow(newdata)){
if(newdata$yr_renovated[i]!=0)
newdata$yr_built[i] <- newdata$yr_renovated[i]
}
newdata$age <- 2020 - newdata$yr_built
newdata <- newdata %>% dplyr::select(price, bedrooms, bathrooms, sqft_living, sqft_lot, floors, waterfront,
view, condition, age, lat, long)
str(newdata)
## tibble [21,613 × 12] (S3: tbl_df/tbl/data.frame)
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living: num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ view : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ condition : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 5 3 3 3 3 3 3 ...
## $ age : num [1:21613] 65 29 87 55 33 19 25 57 60 17 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
dim(newdata)
## [1] 21613 12
## Remove any missings.
newdata <- newdata[rowSums(is.na(newdata))==0,]
dim(newdata)
## [1] 21613 12
library(corrplot)
str(data)
## tibble [21,613 × 19] (S3: tbl_df/tbl/data.frame)
## $ price : num [1:21613] 221900 538000 180000 604000 510000 ...
## $ bedrooms : num [1:21613] 3 3 2 4 3 4 3 3 3 3 ...
## $ bathrooms : num [1:21613] 1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
## $ sqft_living : num [1:21613] 1180 2570 770 1960 1680 ...
## $ sqft_lot : num [1:21613] 5650 7242 10000 5000 8080 ...
## $ floors : num [1:21613] 1 2 1 1 1 1 2 1 1 2 ...
## $ waterfront : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ view : num [1:21613] 0 0 0 0 0 0 0 0 0 0 ...
## $ condition : num [1:21613] 3 3 3 5 3 3 3 3 3 3 ...
## $ grade : num [1:21613] 7 7 6 7 8 11 7 7 7 7 ...
## $ sqft_above : num [1:21613] 1180 2170 770 1050 1680 ...
## $ sqft_basement: num [1:21613] 0 400 0 910 0 1530 0 0 730 0 ...
## $ yr_built : num [1:21613] 1955 1951 1933 1965 1987 ...
## $ yr_renovated : num [1:21613] 0 1991 0 0 0 ...
## $ zipcode : num [1:21613] 98178 98125 98028 98136 98074 ...
## $ lat : num [1:21613] 47.5 47.7 47.7 47.5 47.6 ...
## $ long : num [1:21613] -122 -122 -122 -122 -122 ...
## $ sqft_living15: num [1:21613] 1340 1690 2720 1360 1800 ...
## $ sqft_lot15 : num [1:21613] 5650 7639 8062 5000 7503 ...
corrplot(cor(data))
corrplot(cor(data),method='number')
# Highly correlated variables
correlated <- cor(data)
##The caret findCorrelation evaluates pair-wise correlations across all variables, flagging variables that are highly correlated. Of the identified pairs, the function recommends the removal of the variable with the highest average absolute correlation across the dataset.
highCorr <- findCorrelation(correlated, cutoff=0.80)
highCorr
## [1] 4
names(data[highCorr])
## [1] "sqft_living"
summary(data)
## price bedrooms bathrooms sqft_living
## Min. : 75000 Min. : 0.000 Min. :0.000 Min. : 290
## 1st Qu.: 321950 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1427
## Median : 450000 Median : 3.000 Median :2.250 Median : 1910
## Mean : 540088 Mean : 3.371 Mean :2.115 Mean : 2080
## 3rd Qu.: 645000 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550
## Max. :7700000 Max. :33.000 Max. :8.000 Max. :13540
## sqft_lot floors waterfront view
## Min. : 520 Min. :1.000 Min. :0.000000 Min. :0.0000
## 1st Qu.: 5040 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000
## Median : 7618 Median :1.500 Median :0.000000 Median :0.0000
## Mean : 15107 Mean :1.494 Mean :0.007542 Mean :0.2343
## 3rd Qu.: 10688 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000
## Max. :1651359 Max. :3.500 Max. :1.000000 Max. :4.0000
## condition grade sqft_above sqft_basement
## Min. :1.000 Min. : 1.000 Min. : 290 Min. : 0.0
## 1st Qu.:3.000 1st Qu.: 7.000 1st Qu.:1190 1st Qu.: 0.0
## Median :3.000 Median : 7.000 Median :1560 Median : 0.0
## Mean :3.409 Mean : 7.657 Mean :1788 Mean : 291.5
## 3rd Qu.:4.000 3rd Qu.: 8.000 3rd Qu.:2210 3rd Qu.: 560.0
## Max. :5.000 Max. :13.000 Max. :9410 Max. :4820.0
## yr_built yr_renovated zipcode lat
## Min. :1900 Min. : 0.0 Min. :98001 Min. :47.16
## 1st Qu.:1951 1st Qu.: 0.0 1st Qu.:98033 1st Qu.:47.47
## Median :1975 Median : 0.0 Median :98065 Median :47.57
## Mean :1971 Mean : 84.4 Mean :98078 Mean :47.56
## 3rd Qu.:1997 3rd Qu.: 0.0 3rd Qu.:98118 3rd Qu.:47.68
## Max. :2015 Max. :2015.0 Max. :98199 Max. :47.78
## long sqft_living15 sqft_lot15
## Min. :-122.5 Min. : 399 Min. : 651
## 1st Qu.:-122.3 1st Qu.:1490 1st Qu.: 5100
## Median :-122.2 Median :1840 Median : 7620
## Mean :-122.2 Mean :1987 Mean : 12768
## 3rd Qu.:-122.1 3rd Qu.:2360 3rd Qu.: 10083
## Max. :-121.3 Max. :6210 Max. :871200
#Let’s split the loaded dataset into train and test sets. We will use 80% of the data to train our models and 20% will be used to test the models..
set.seed(123)
ind <- sample(2, nrow(newdata), prob = c(0.8, 0.2), replace = T)
train <- newdata[ind == 1, ]
test <- newdata[ind == 2,]
dim(newdata)
## [1] 21613 12
dim(train)
## [1] 17346 12
dim(test)
## [1] 4267 12
data
## # A tibble: 21,613 × 19
## price bedrooms bathrooms sqft_living sqft_lot floors waterfront view
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 221900 3 1 1180 5650 1 0 0
## 2 538000 3 2.25 2570 7242 2 0 0
## 3 180000 2 1 770 10000 1 0 0
## 4 604000 4 3 1960 5000 1 0 0
## 5 510000 3 2 1680 8080 1 0 0
## 6 1225000 4 4.5 5420 101930 1 0 0
## 7 257500 3 2.25 1715 6819 2 0 0
## 8 291850 3 1.5 1060 9711 1 0 0
## 9 229500 3 1 1780 7470 1 0 0
## 10 323000 3 2.5 1890 6560 2 0 0
## # ℹ 21,603 more rows
## # ℹ 11 more variables: condition <dbl>, grade <dbl>, sqft_above <dbl>,
## # sqft_basement <dbl>, yr_built <dbl>, yr_renovated <dbl>, zipcode <dbl>,
## # lat <dbl>, long <dbl>, sqft_living15 <dbl>, sqft_lot15 <dbl>
str(train)
## tibble [17,346 × 12] (S3: tbl_df/tbl/data.frame)
## $ price : num [1:17346] 221900 538000 180000 1225000 257500 ...
## $ bedrooms : num [1:17346] 3 3 2 4 3 3 3 2 3 3 ...
## $ bathrooms : num [1:17346] 1 2.25 1 4.5 2.25 1 2.5 1 1 1.75 ...
## $ sqft_living: num [1:17346] 1180 2570 770 5420 1715 ...
## $ sqft_lot : num [1:17346] 5650 7242 10000 101930 6819 ...
## $ floors : num [1:17346] 1 2 1 1 2 1 2 1 1.5 1 ...
## $ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ view : Factor w/ 5 levels "0","1","2","3",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ condition : Factor w/ 5 levels "1","2","3","4",..: 3 3 3 3 3 3 3 4 4 4 ...
## $ age : num [1:17346] 65 29 87 19 25 60 17 78 93 43 ...
## $ lat : num [1:17346] 47.5 47.7 47.7 47.7 47.3 ...
## $ long : num [1:17346] -122 -122 -122 -122 -122 ...
lm_model <- lm(price ~ .,
data = train)
lm_model
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Coefficients:
## (Intercept) bedrooms bathrooms sqft_living sqft_lot floors
## -5.035e+07 -4.605e+04 3.574e+04 2.727e+02 -3.266e-02 3.629e+04
## waterfront1 view1 view2 view3 view4 condition2
## 4.792e+05 1.272e+05 8.924e+04 1.728e+05 3.622e+05 -8.403e+03
## condition3 condition4 condition5 age lat long
## -8.521e+03 2.781e+04 5.558e+04 1.156e+03 6.581e+05 -1.552e+05
summary(lm_model)
##
## Call:
## lm(formula = price ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1360891 -107996 -10354 80713 4092628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.035e+07 1.697e+06 -29.673 <2e-16 ***
## bedrooms -4.605e+04 2.247e+03 -20.499 <2e-16 ***
## bathrooms 3.574e+04 3.889e+03 9.191 <2e-16 ***
## sqft_living 2.727e+02 3.095e+00 88.104 <2e-16 ***
## sqft_lot -3.266e-02 4.134e-02 -0.790 0.430
## floors 3.629e+04 3.819e+03 9.504 <2e-16 ***
## waterfront1 4.792e+05 2.460e+04 19.482 <2e-16 ***
## view1 1.272e+05 1.365e+04 9.321 <2e-16 ***
## view2 8.924e+04 8.158e+03 10.938 <2e-16 ***
## view3 1.728e+05 1.117e+04 15.472 <2e-16 ***
## view4 3.622e+05 1.801e+04 20.110 <2e-16 ***
## condition2 -8.403e+03 4.908e+04 -0.171 0.864
## condition3 -8.521e+03 4.564e+04 -0.187 0.852
## condition4 2.781e+04 4.564e+04 0.609 0.542
## condition5 5.558e+04 4.590e+04 1.211 0.226
## age 1.156e+03 8.479e+01 13.638 <2e-16 ***
## lat 6.581e+05 1.243e+04 52.938 <2e-16 ***
## long -1.552e+05 1.338e+04 -11.600 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 218000 on 17328 degrees of freedom
## Multiple R-squared: 0.6481, Adjusted R-squared: 0.6478
## F-statistic: 1877 on 17 and 17328 DF, p-value: < 2.2e-16
#Predict
pLm <- predict(lm_model,test)
postResample(pLm,test$price)
## RMSE Rsquared MAE
## 2.134471e+05 6.612866e-01 1.378992e+05
plLinearSimple <-test %>%
ggplot(aes(price,pLm)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of price') +
ylab('Predicted value of price')+
theme_bw()
ggplotly(plLinearSimple)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#Cross validation
x <- data.matrix(train)
y <- train$price
control <- trainControl(method = "cv",
number = 10)
lineerCV <- train(price~.,
data = train,
method = "lm",
trControl = control )
lineerCV
## Linear Regression
##
## 17346 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 15610, 15611, 15612, 15611, 15612, 15610, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 218345.4 0.6456408 137295.5
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lineerCV)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1360891 -107996 -10354 80713 4092628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.035e+07 1.697e+06 -29.673 <2e-16 ***
## bedrooms -4.605e+04 2.247e+03 -20.499 <2e-16 ***
## bathrooms 3.574e+04 3.889e+03 9.191 <2e-16 ***
## sqft_living 2.727e+02 3.095e+00 88.104 <2e-16 ***
## sqft_lot -3.266e-02 4.134e-02 -0.790 0.430
## floors 3.629e+04 3.819e+03 9.504 <2e-16 ***
## waterfront1 4.792e+05 2.460e+04 19.482 <2e-16 ***
## view1 1.272e+05 1.365e+04 9.321 <2e-16 ***
## view2 8.924e+04 8.158e+03 10.938 <2e-16 ***
## view3 1.728e+05 1.117e+04 15.472 <2e-16 ***
## view4 3.622e+05 1.801e+04 20.110 <2e-16 ***
## condition2 -8.403e+03 4.908e+04 -0.171 0.864
## condition3 -8.521e+03 4.564e+04 -0.187 0.852
## condition4 2.781e+04 4.564e+04 0.609 0.542
## condition5 5.558e+04 4.590e+04 1.211 0.226
## age 1.156e+03 8.479e+01 13.638 <2e-16 ***
## lat 6.581e+05 1.243e+04 52.938 <2e-16 ***
## long -1.552e+05 1.338e+04 -11.600 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 218000 on 17328 degrees of freedom
## Multiple R-squared: 0.6481, Adjusted R-squared: 0.6478
## F-statistic: 1877 on 17 and 17328 DF, p-value: < 2.2e-16
#Predict
pLmCV <- predict(lineerCV,test)
postResample(pLmCV,test$price)
## RMSE Rsquared MAE
## 2.134471e+05 6.612866e-01 1.378992e+05
plLinearCV<-test %>%
ggplot(aes(price,pLmCV)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of price') +
ylab('Predicted value of price')+
theme_bw()
ggplotly(plLinearCV)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
ridge <- train(price~.,
data = train,
method = "glmnet",
tuneGrid = expand.grid(alpha = 0,
lambda = seq(0.0001,1,length=50)),
trControl = control )
pRidge <- predict(ridge,test)
postResample(pRidge,test$price)
## RMSE Rsquared MAE
## 2.138839e+05 6.602660e-01 1.350585e+05
plRidge <-test %>%
ggplot(aes(price,pRidge)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of price') +
ylab('Predicted value of price')+
theme_bw()
ggplotly(plRidge)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
lasso <- train(price~.,
data = train,
method = "glmnet",
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0.0001,1,length=50)),
trControl = control )
pLasso <- predict(lasso,test)
postResample(pLasso,test$price)
## RMSE Rsquared MAE
## 2.134865e+05 6.610694e-01 1.377348e+05
plLasoo <-test %>%
ggplot(aes(price,pLasso)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of price') +
ylab('Predicted value of price')+
theme_bw()
ggplotly(plLasoo)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
library(catboost)
#Separate x and y of train and test dataset, which will very useful when we using this in the catboost package.
library(dplyr)
y_train <- unlist(train[c('price')])
X_train <- train %>% select(-price)
y_valid <- unlist(test[c('price')])
X_valid <- test %>% select(-price)
#Convert the train and test dataset to catboost specific format using the load_pool function by mentioning x and y of both train and test.
train_pool <- catboost.load_pool(data = X_train, label = y_train)
test_pool <- catboost.load_pool(data = X_valid, label = y_valid)
#Create an input params for the CatBoost regression.
params <- list(iterations=500,
learning_rate=0.01,
depth=10,
loss_function='RMSE',
eval_metric='RMSE',
random_seed = 55,
od_type='Iter',
metric_period = 50,
od_wait=20,
use_best_model=TRUE)
modelCatboost <- catboost.train(learn_pool = train_pool,params = params)
## You should provide test set for use best model. use_best_model parameter has been switched to false value.
## 0: learn: 364735.1932310 total: 199ms remaining: 1m 39s
## 50: learn: 271078.1687112 total: 3.47s remaining: 30.5s
## 100: learn: 215954.3839468 total: 6.57s remaining: 26s
## 150: learn: 183005.6666038 total: 9.72s remaining: 22.5s
## 200: learn: 162997.0175298 total: 13.1s remaining: 19.4s
## 250: learn: 149479.6936442 total: 16.2s remaining: 16.1s
## 300: learn: 140688.4749743 total: 19.4s remaining: 12.8s
## 350: learn: 136098.3140342 total: 21.9s remaining: 9.29s
## 400: learn: 132910.1449928 total: 23.9s remaining: 5.9s
## 450: learn: 130175.8951410 total: 26s remaining: 2.83s
## 499: learn: 127939.0697817 total: 28.2s remaining: 0us
y_pred=catboost.predict(modelCatboost,test_pool)
catboostMetrics <- postResample(y_pred,test$price)
catboostMetrics
## RMSE Rsquared MAE
## 1.460674e+05 8.526002e-01 8.405154e+04
plCatboost <-test %>%
ggplot(aes(price,y_pred)) +
geom_point(alpha=0.5) +
stat_smooth(aes(colour='black')) +
xlab('Actual value of price') +
ylab('Predicted value of price')+
theme_bw()
ggplotly(plCatboost)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'